Murine species trees
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(ggplot2)
library(cowplot)
library(ggbeeswarm)
library(dplyr)
library(kableExtra)
library(tidyr)
library(ggtree)
library(phytools)
library(phangorn)
library(reshape2)
library(ggExtra)
library(ggrepel)
library(vroom)
library(ggdist)
library(stringr)
library(ggsignif)
library(phytools)
library(castor)
library(MCMCtreeR)
library(here)
library(stringr)
library(BAMMtools)
source(here("docs", "scripts", "lib", "design.r"))
source(here("docs", "scripts", "lib", "get_tree_info.r"))
#source("C:/bin/core/r/design.r")
#source("C:/bin/core/r/get_tree_info.r")
#source("C:/Users/grt814/bin/core/corelib/design.r")
#source("C:/Users/grt814/bin/core/get_tree_info.r")
#htmltools::includeHTML("../html-chunks/rmd_nav.html")1 Full coding species tree
- 188 species targeted capture to mouse exons
- Assembled with SPADES
- 11,795 coding loci aligned with exonerate+MAFFT
- Gene trees inferred with IQ-Tree
- Species tree inferred with ASTRAL
- Branch lengths inferred with IQ-Tree using the 866 loci with a normalized RF distance to the species tree of less than 0.25.
# This chunk handles all of the main inputs and reads the tree
# cat("188 species targeted capture to mouse exons assembled with SPADES.\n")
# cat("11,775 coding loci aligned with exonerate+mafft\n")
# cat("Gene trees inferred with IQtree.\n")
# cat("Species tree inferred with ASTRAL.\n")
# cat("Branch lengths estimated by ASTRAL.\n")
# Data summary
###############
#tree_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.tree")
tree_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.tree")
# Newick tree file, with tp labels
rodent_tree = read.tree(tree_file)
tree_to_df_list = treeToDF(rodent_tree)
tree_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF
tree_info = tree_info %>% separate(label, c("tp", "support"), sep=">", remove=F)
tree_info$tp = paste(tree_info$tp, ">", sep="")
tree_info = tree_info %>% separate(support, c("astral", "gcf", "scf"), sep="/", remove=T)
# Split the label by /. tp is my treeParse label.
tree_info$astral[tree_info$node.type=="tip"] = NA
# Fill in ASTRAL support as NA for the tips
tree_info$astral = as.numeric(tree_info$astral)
tree_info$gcf = as.numeric(tree_info$gcf)
tree_info$scf = as.numeric(tree_info$scf)
# Convert all supports to numeric
###############
#iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.branch")
iq_tree_labels = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.branch")
# Newick tree file, with iqtree labels
###############
#cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.stat")
cf_stat_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.stat")
cf_stats = read.table(cf_stat_file, header=T)
# Concordance factor file
###############
#cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted", "cf-reps")
cf_rep_dir = here("docs", "data", "trees", "astral", "concord-rooted-bl", "cf-reps")
#delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted", "delta.tab")
delta_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "delta.tab")
# CF reps for delta and delta outfile
###############
col_file = here("docs", "data", "trees", "astral", "astral-colonization-branches.txt")
exclude_branches = c("Lophiomys_imhausi_UM5152", "Lophuromys_woosnami_LSUMZ37793", "<1>", "<187>", "<186>")
# Colonization branches and branches around the root to exclude from some things
###############
gt_file = here("docs", "data", "trees", "gene-trees", "loci-labeled.treefile")
# The file containing the gene trees
###############
#xmax = 31
xmax = 0.175
# The max of the x-axis for tree figures# The node/branch labels in R and IQtree differ. IQtree uses a nice, logical post-ordering
# of internal nodes while R does something random and assigns labels to tips as well. This
# chunk matches those up for the delta analysis later
iq_tree = read.tree(iq_tree_labels)
iqtree_to_df_list = treeToDF(iq_tree)
iqtree_info = iqtree_to_df_list[["info"]]
# Read the IQ tree tree with branch labels in and parse with get_tree_info
#node_convert = matchNodes(tree_to_df_list[["labeled.tree"]], iqtree_to_df_list[["labeled.tree"]], method="descendants")
tree_info$iqtree.node = NA
# Add a column to the main tree table about IQ tree labels
for(i in 1:nrow(tree_info)){
cur_node = tree_info[i,]$node
iqtree_row = subset(iqtree_info, node==cur_node)
iqtree_label = iqtree_row$label
tree_info[i,]$iqtree.node = iqtree_label
}
# For every row in the main tree table, add in the IQ tree node label given that
# we've read the same tree in R and can use the node.labels1.1 Branches colored by gCF
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
gcf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$gcf)) +
scale_color_continuous(name='gCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-rooted-gcf.png")
ggsave(fig_outfile, gcf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree1.2 Branches colored by sCF
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
scf_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$scf)) +
scale_color_continuous(name='sCF', low=l, high=h, limits=c(0,100)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9))
#gcf_tree = gcf_tree + geom_text(aes(x=branch, label=ifelse(tree_info$node.type=="internal",as.character(node), ''), label.size=NA, fill="transparent"), size=2, vjust=-0.2)
# Option to add branch labels to the figure
print(gcf_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-rooted-scf.png")
ggsave(fig_outfile, scf_tree, width=14, height=14, unit="in")
# Save the tree image
## gCF tree1.3 Gene vs site concordance factors colored by branch support
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=gcf, y=scf, color=astral)) +
geom_point(size=2, alpha=0.5) +
scale_color_continuous(name='Astral support', low=l, high=h, limits=c(0.8,1)) +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-astral-cf.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure1.4 Concordance factors vs. branch lengths
bl_gcf_p = ggplot(tree_info, aes(x=branch.length, y=gcf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("gCF per branch") +
bartheme()
bl_gcf_p = ggExtra::ggMarginal(bl_gcf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666")
bl_scf_p = ggplot(tree_info, aes(x=branch.length, y=scf)) +
geom_point(size=3, alpha=0.5) +
#geom_text_repel(aes(label=ifelse(avg.ds>0.2|avg.dn>0.01,as.character(node),'')), show_guide=F) +
#geom_smooth(method="lm", se=F, ) +
scale_x_continuous(limits=c(0,0.03)) +
xlab("Branch length") +
ylab("sCF per branch") +
bartheme()
bl_scf_p = ggExtra::ggMarginal(bl_scf_p, type="histogram", bins=50, fill=corecol(pal="wilke", numcol=1, offset=5), color="#666666")
p = plot_grid(bl_gcf_p, bl_scf_p, ncol=2)
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-astral-cf-bl.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure2 Gene trees
The 11,774 gene trees contain varying numbers of taxa due to filtering on the alignment and filtering by IQtree for identical sequences.
2.1 Taxa per gene tree
gene_trees = read.table(gt_file, sep="\t")
names(gene_trees) = c("gene", "tree")
# Read the geen tree file
gene_trees$gene = word(gene_trees$gene, 1, sep="-")
gene_trees$gene = word(gene_trees$gene, 2, sep="/")
# Parse out the protein ID from the filename for each gene
###############
gt_data = data.frame("gene"=c(), "rf"=c(), "num.tips"=c(), "rf.zeros"=c(), "num.tips.zeros"=c())
# A df to track info for each gene tree
for(i in 1:nrow(gene_trees)){
gt = read.tree(text=gene_trees[i,]$tree)
# Read the gene tree as a phylo object
cur_tips = gt$tip.label
pruned_tree = drop.tip(rodent_tree, rodent_tree$tip.label[-match(cur_tips, rodent_tree$tip.label)])
# Get a list of the tips and prune the species tree to contain only those tips
cur_rf = RF.dist(pruned_tree, gt, normalize=T)
# Calculate RF between the pruned species tree and the gene tree
if(cur_rf==0){
gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=cur_rf, "num.tips.zeros"=length(cur_tips)))
}else{
gt_data = rbind(gt_data, data.frame("gene"=gene_trees[i,]$gene, "rf"=cur_rf, "num.tips"=length(cur_tips), "rf.zeros"=NA, "num.tips.zeros"=NA))
}
# Add the gene tree info to the df, with a special case for when the RF is 0
}
# Read all the gene trees
p = ggplot(gt_data, aes(x=num.tips)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=6), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("# of taxa") +
ylab("# of gene trees") +
bartheme()
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-gt-taxa-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure
#rf_file = here("docs", "data", "trees", "astral", "concord-rooted", "loci-nrf-below-0.25.txt")
#write.table(subset(gt_data, rf<=0.25), file=rf_file, row.names=F, quote=F, sep="\t")
# Write info for genes with low RF2.2 Tree distance (RF) between gene trees and species tree
Since RF cannot handle missing taxa, the species tree is pruned for each gene tree to calculate Robinson-Foulds distance. We use the normalized metric since there are varying numbers of tips per gene tree.
p = ggplot(gt_data, aes(x=rf)) +
geom_histogram(bins=50, fill=corecol(pal="wilke", numcol=1, offset=4), color="#666666") +
#geom_quasirandom(size=2, width=0.25, alpha=0.25, color="#666666") +
#geom_boxplot(outlier.shape=NA, alpha=0.75, width=0.5, color="#666666") +
scale_y_continuous(expand=c(0,0)) +
xlab("Normalized RF") +
ylab("# of gene trees") +
bartheme()
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-gt-rf-dist.png")
ggsave(fig_outfile, p, width=6, height=3, unit="in")
# Save the figure3 Delta
3.1 Introduction
For each lineage in the species tree with gCF < 95% we calculated the \(\Delta\) statistic (Huson et al. 2005). This statistic follows the same logic as the ABBA-BABA site patterns used to calculate D-statistics, but uses gene tree topologies instead of alignment sites. Briefly, a given branch in an unrooted tree is defined by a quartet of species groupings with two possible discordant topologies, \(D_1\) and \(D_2\) (see Figure 1 from Minh et al. 2020). Under assumptions that discordance is caused by ILS, both discordant topologies should be present in equal proportions. However, if introgression has occurred one discordant topology will appear more frequently than the other. \(\Delta\) is calculated for a branch as follows, using the frequency of each discordant topology (Vanderpool et al. 2020):
\[\Delta = \frac{D_1 - D_2}{D_1 + D_2}\]
This normalized \(\Delta\) calculation ensures that all values are scaled between 0 and 1, with larger values indicating a larger skew towards one topology, and a higher chance that introgression has occurred.
To test whether the observed \(\Delta\) values are skewed significantly from 0 to imply introgression, we performed concordance factor analysis on 1,000 bootstrap replicates of our inferred gene trees to generate a null distribution of \(\Delta\) values. We then calculated Z-scores and p-values and assessed significance for each branch at a threshold of 0.01.
3.2 Null and actual distributions
Nodes with p < 0.01:
tree_info$delta.sig = F
# Add a column to the tree info table about significant delta values
###############
fig_outfile = here("docs", "figs", "full-coding-delta-dists.png")
# The image file to either generate or insert
if(!file.exists(delta_outfile)){
cf_stats$delta = (abs(cf_stats$gDF1_N - cf_stats$gDF2_N)) / (cf_stats$gDF1_N + cf_stats$gDF2_N)
# Calculate the delta values on the actual data
iqtree_delta = select(cf_stats, ID, delta)
names(iqtree_delta) = c("iqtree.node", "delta")
tree_info = merge(x = tree_info, y = iqtree_delta, by = "iqtree.node", all.x=TRUE)
# Adding the delta values to the main tree info table
tree_info = tree_info[order(tree_info$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
low_cf_nodes = subset(cf_stats, gCF < 95)
# Get the low concordance factor nodes from the data to test with delta
delta_null = c()
for(i in 0:999){
cur_rep_str = as.character(i)
while(nchar(cur_rep_str) < 4){
cur_rep_str = paste("0", cur_rep_str, sep="")
}
# Handling the string of the rep
cur_cf_file = paste(cf_rep_dir, "/rep", cur_rep_str, ".cf.stat", sep="")
cf_rep = read.table(cur_cf_file, header=T, fill=T)
# Read the current reps cf file
cf_rep$delta = (abs(cf_rep$gDF1_N - cf_rep$gDF2_N)) / (cf_rep$gDF1_N + cf_rep$gDF2_N)
delta_null = c(delta_null, cf_rep$delta)
# Calculate delta for this rep and save values in vector
}
# Read concordance factors from bootstrap samples of gene trees and calculate delta to generate
# null distribution
###############
delta_null_df = data.frame("delta"=delta_null, y="duh")
delta_null_df = subset(delta_null_df, !is.nan(delta))
# Convert the delta values to a data frame for ggplot
delta_mu = mean(delta_null_df$delta, na.rm=T)
delta_sd = sd(delta_null_df$delta, na.rm=T)
# Calculate the mean and sd of the null distribution to get z-scores and p-values
###############
delta_out = data.frame("node"=c(), "delta"=c(), "z-score"=c(), "p-value"=c())
# Initialize output data frame
for(i in 1:nrow(low_cf_nodes)){
row = low_cf_nodes[i,]
z = (row$delta - delta_mu) / delta_sd
p = pnorm(-abs(z))
if(p < 0.01){
print(paste(row$ID, row$delta, z, p, sep=" "))
tree_info$delta.sig[tree_info$iqtree.node==row$ID] = T
# Set the delta significant to TRUE in the main tree info table
}
delta_out = rbind(delta_out, data.frame("node"=row$ID, "delta"=row$delta, "z-score"=z, "p-value"=p))
}
# Calculate z and p for each low gCF node in the species tree and save to output data frame
write.table(file=delta_outfile, delta_out, sep="\t", row.names=F)
# Write the delta values per node to the output file
###############
delta_null_p = ggplot(delta_null_df, aes(x=delta)) +
geom_histogram(color="#ececec", bins=50) +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
delta_actual_p = ggplot(low_cf_nodes, aes(x=delta)) +
geom_histogram(fill="#920000", color="#ececec") +
scale_x_continuous(limits=c(0,1)) +
scale_y_continuous(expand=c(0,0)) +
xlab("Delta") +
ylab("# nodes") +
bartheme()
delta_p = plot_grid(delta_null_p, delta_actual_p, ncol=2, labels=c("Bootstrap distribution", "Actual distribution"), label_size=12)
print(delta_p)
###############
ggsave(fig_outfile, delta_p, width=10, height=4, unit="in")
# Save the figure
}else{
tree_info$delta = NA
delta_out = read.table(delta_outfile, header=T)
for(i in 1:nrow(delta_out)){
row = delta_out[i,]
tree_info$delta[tree_info$iqtree.node==row$node] = row$delta
if(row$p.value < 0.01){
print(paste(row$node, row$delta, row$z.score, row$p.value, sep=" "))
tree_info$delta.sig[tree_info$iqtree.node==row$node] = T
# Set the delta significant to TRUE in the main tree info table
}
}
knitr::include_graphics(fig_outfile)
}## [1] "219 0.406732117812062 2.44516394160885 0.00723931554647087"
## [1] "222 0.407151095732411 2.44879368056455 0.00716677631309369"
## [1] "232 0.622047244094488 4.31050734912284 8.14402076760321e-06"
## [1] "260 0.416606498194946 2.53070883884166 0.00569161487327277"
## [1] "272 0.531951640759931 3.52998048773703 0.000207795157849876"
## [1] "273 0.464877663772691 2.9488972960955 0.00159454968995379"
## [1] "317 0.510204081632653 3.34157446746339 0.000416523334864317"
## [1] "324 0.407725321888412 2.45376838435342 0.00706840016380011"
3.3 Branches by delta
Branches with significant delta are labeled
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
delta_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta.png")
ggsave(fig_outfile, delta_tree, width=14, height=14, unit="in")
# Save the tree image3.4 Branches with significant delta
h = corecol(numcol=1, pal="wilke", offset=1)
l = corecol(numcol=1, offset=1)
intro_tree = ggtree(rodent_tree, size=0.8, ladderize=F, aes(color=tree_info$delta.sig)) +
scale_color_manual(name="Significant Delta", labels=c("False", "True"), values=corecol(pal="trek", numcol=2)) +
xlim(0, xmax) +
geom_tiplab(color="#333333", fontface='italic', size=2) +
theme(legend.position=c(0.05,0.9)) +
geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(intro_tree)3.5 Concordance factors and delta
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=gcf, y=scf, color=delta)) +
geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_color_continuous(name='Delta', low=l, high=h) +
xlab("gCF") +
ylab("sCF") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-cf.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure3.6 Branch length and delta
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
# Colors
p = ggplot(tree_info, aes(x=branch.length, y=delta, color=delta)) +
geom_point(size=2, alpha=0.5) +
geom_text_repel(aes(label=ifelse(delta.sig,as.character(tp),'')), show_guide=F, min.segment.length=0) +
scale_x_continuous(limits=c(0,0.03)) +
#scale_color_continuous(name='Delta', low=l, high=h) +
xlab("Branch length") +
ylab("Delta") +
bartheme() +
theme(legend.title=element_text(size=12))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-bl.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure3.7 Species tree without tips
#no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted", "full_coding_iqtree_astral.cf.labeled.notips.tree")
no_tip_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "full_coding_iqtree_astral_rooted_bl.cf.labeled.notips.tree")
tree2 = read.tree(no_tip_file)
tree_to_df_list = treeToDF(tree2)
tree2_info = tree_to_df_list[["info"]]
names(tree2_info)[5] = "tp"
# Read the tree and parse with treetoDF
tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree2_info = left_join(tree2_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree
tree2_tip_info = subset(tree2_info, node.type=="tip")
## No tip tree
###############
tree3 = drop.tip(tree2, "<1>")
tree_to_df_list = treeToDF(tree3)
tree3_info = tree_to_df_list[["info"]]
names(tree3_info)[5] = "tp"
# Read the tree and parse with treetoDF
tree_info_notip = subset(tree_info, tp %in% tree2_info$tp)
tree_info_notip = tree_info_notip %>% select(tp, delta, delta.sig)
tree3_info = left_join(tree3_info, tree_info_notip, by="tp")
# Add the delta values for each node to the no tip tree
tree3_tip_info = subset(tree3_info, node.type=="tip")
## No tip tree, no outgroup
###############
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
#no_tip_max = 13
no_tip_max = 0.13
delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image3.8 Phylogenetic signal of delta (Pagel’s lambda, Bloomberg K)
# Generally helpful: https://lukejharmon.github.io/pcm/chapter6_beyondbm/
phylosig(tree2, tree2_tip_info$delta, test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal K : 0.170244
## P-value (based on 1000 randomizations) : 0.21
# Bloomberg K
phylosig(tree2, tree2_tip_info$delta, method="lambda", test=T)## [1] "x has no names; assuming x is in the same order as tree$tip.label"
## [1] "some data in x given as 'NA', dropping corresponding species from tree"
##
## Phylogenetic signal lambda : 6.6107e-05
## logL(lambda) : 50.1434
## LR(lambda=0) : -0.00221458
## P-value (based on LR test) : 1
#Pagel's lambda
###############
#new_tree = drop.tip(no_tip_tree, "<1>")
#new_info = subset(new_tip_info, tp != "<1>")
#a = data.frame(x=new_info$branch.length, y=new_info$delta, names=new_info$tp, row.names=new_info$tp)
#picModel <- pic.pgls(formula = y ~ x, phy = new_tree, y = a, lambda = "ML", return.intercept.stat = FALSE)
# PGLS.. not sure how to interpret
# https://cran.r-project.org/web/packages/motmot/vignettes/motmot.html#fast-estimation-of-phylogenetic-generalised-least-squares
# Or maybe this is better: https://lukejharmon.github.io/ilhabela/instruction/2015/07/03/PGLS/
###############3.9 Phylogenetic independent contrasts of delta
PIC are basically the expected differences in trait values between pairs of branches descending from a node in a tree corrected for expected variance, which is just the sum of the branch lengths. For example, for a given node \(N\) in a tree, the PIC will be:
\[PIC(N) = \frac{V_1 - V_2}{L_1 + L_2}\]
With a post-order traversal of the tree, PICs can be calculated for each node in the tree. Generally, for internal nodes, a trait value \(V\) is assigned after its PIC has been calculated so it can be used in calculation of the PIC of its ancestor.
\[V = \frac{(1/L_1)V_1+(1/L_2)V_2}{1/L_1+1/L_2}\]
The branch is also lengthened to account for possible error in \(V\).
Finally, the rate of trait evolution \(R\) is calculated as
\[R = \frac{\sum ^{n}{PIC_n}}{n}\]
where \(n\) is the number of internal nodes in the tree and \(PIC_n\) is the PIC calculated for that node.
However, we have no need to estimate values \(V\) at internal nodes in our tree since we have delta values for each branch. Here I compare both the contrasts themselves and the rate estimated from them when using the inferred ancestral states or the observed states in our tree.
# Helpful:
# https://bio.libretexts.org/Bookshelves/Evolutionary_Developmental_Biology/Phylogenetic_Comparative_Methods_(Harmon)/04%3A_Fitting_Brownian_Motion/4.02%3A_Estimating_Rates_using_Independent_Contrasts
# https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/phylogenetic-independent-contrasts/
inferred_pic = pic(tree3_tip_info$delta, tree3, var.contrasts=T, rescaled.tree=T)
# Compute the contrasts based on the 'tip' values from the tree
inferred_rate = sum(inferred_pic$contr[,1]^2)/nrow(inferred_pic$contr)
# Compute the rate from the inferred constrasts
###############
tree2_info$pic = NA
tree2_info$inferred.pic = NA
# Columns to add to the tree info
num_contrasts = 0
# Keep track of the number of contrasts done to calculate rate later
for(i in 1:nrow(tree2_info)){
row = tree2_info[i,]
if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}
# Skip tips and the root since we can't calculate on them
tree2_info$inferred.pic[tree2_info$tp==row$tp] = inferred_pic[["contr"]][,1][[row$tp]]
# Add the inferred pic to the tree info df here
desc = getDescendants(tree2, row$node)
desc1 = tree2_info[tree2_info$node==desc[1],]
desc2 = tree2_info[tree2_info$node==desc[2],]
# Get the descendants of the current node
pic = (desc1$delta - desc2$delta) / (desc1$branch.length + desc2$branch.length)
# Calculate the contrast based on the descendant deltas and branch lenghts
tree2_info$pic[tree2_info$node==row$node] = pic
# Add the contrast to the tree info df
num_contrasts = num_contrasts + 1
# Increment number of contrasts
}
observed_rate = sum(tree2_info$pic^2, na.rm=t)/num_contrasts
# Compute the rate from the observed contrasts
## Compute contrasts from observed delta
###############
plot_info = subset(tree2_info, node.type != "tip" & !is.na(delta))
#a = subset(no_tip_info, node.type != "tip")
a = select(plot_info, tp, pic, delta.sig)
a$label = "Observed"
b = select(plot_info, tp, inferred.pic, delta.sig)
names(b)[2] = "pic"
b$label = "Inferred"
z = rbind(a,b)
x_comps = list(c("Observed", "Inferred"))
p = ggplot(data=z, aes(x=label, y=pic, group=label, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1, test="ks.test") +
#scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=6), corecol(numcol=1, pal="wilke", offset=4))) +
xlab("") +
ylab("PIC") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(p)cat("KS test. Note that distributions are not significantly different with Wilcox test\n")## KS test. Note that distributions are not significantly different with Wilcox test
###############
fig_outfile = here("docs", "figs", "full-coding-delta-pic.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure
###############
rate_table = data.frame("Rate type"=c("Inferred", "Observed"), "Rate"=c(inferred_rate, observed_rate))
rate_table %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Rate.type | Rate |
|---|---|
| Inferred | 1.084021 |
| Observed | 374.168588 |
# Table to display rates3.10 Ancestral reconstruction of delta values
#new_tree = drop.tip(no_tip_tree, "<1>")
# tree_to_df_list = treeToDF(tree2_no_out)
# tree2_no_out_ = tree_to_df_list[["info"]]
# names(new_tree_info)[5] = "tp"
# Drop the outgroup and re-read the tree
#new_info = subset(new_tip_info, tp != "<1>")
###############
a = data.frame(delta=tree3_tip_info$delta, row.names=tree3_tip_info$tp)
b = as.matrix(a)[,1]
fit = fastAnc(tree3, b, vars=T,CI=T)
# Transform delta values from tips and infer ancestral states
#obj<-contMap(new_tree,b,plot=FALSE)
#plot(obj,type="fan",legend=0.7*max(nodeHeights(new_tree)),
# fsize=c(0.7,0.9))
# See http://www.phytools.org/eqg2015/asr.html
###############
tree2_info$delta.recon = NA
for(i in 1:nrow(tree2_info)){
row = tree2_info[i,]
if(row$node.type == "tip" || row$node.type=="ROOT"){
next
}
tp_node = row$tp
node = row$node
new_node = tree3_info$node[tree3_info$tp==tp_node]
tree2_info$delta.recon[tree2_info$tp==row$tp] = fit[["ace"]][[as.character(new_node)]]
}
# Add the inferred delta values to the main data frame for the no tip tree
###############
# delta_tree = ggtree(fit$rescaled.tree, size=0.8, ladderize=F) +
# #scale_color_continuous(name="Delta", high=h, low=l) +
# xlim(0, 0.2) +
# geom_tiplab(color="#333333", fontface='italic', size=5) +
# theme(legend.position=c(0.05,0.9))
# #geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
# #geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
# #geom_nodepoint(color="#666666", alpha=0.85, size=4)
# print(delta_tree)
h = corecol(numcol=1, pal="wilke", offset=3)
l = corecol(numcol=1, offset=3)
#no_tip_max = 13
no_tip_max = 0.13
delta_tree = ggtree(tree2, size=0.8, ladderize=F, aes(color=tree2_info$delta.recon)) +
scale_color_continuous(name="Delta", high=h, low=l) +
xlim(0, no_tip_max) +
geom_tiplab(color="#333333", fontface='italic', size=5) +
theme(legend.position=c(0.05,0.9))
#geom_label(aes(x=branch, label=ifelse(tree_info$delta.sig,as.character(tree_info$tp),'')), label.size=NA, fill="transparent", show.legend=F, vjust=-0.1)
#geom_text(aes(label=rodent_data$support), hjust=-.1, color="#006ddb") +
#geom_nodepoint(color="#666666", alpha=0.85, size=4)
print(delta_tree)# Tree with ancestral reconstructions of delta
###############
fig_outfile = here("docs", "figs", "full-coding-astral-delta-recon-notips.png")
ggsave(fig_outfile, delta_tree, width=12, height=12, unit="in")
# Save the tree image# w = subset(tree2_info, node.type=="tip")
# w = select(w, tp, delta)
# w$label = "delta.recon"
x = subset(tree2_info, node.type != "tip")
x = select(x, tp, delta, delta.sig)
x$label = "Observed"
y = select(tree2_info, tp, delta.recon, delta.sig)
names(y)[2] = "delta"
y$label = "Inferred"
z = rbind(x,y)
# Convert to long with two categories
###############
x_comps = list(c("Observed", "Inferred"))
p = ggplot(z, aes(x=label, y=delta, group=label, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Delta") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-recon.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure3.11 Delta differences to ancestors
My own attempt at some sort of phylogenetic independent comparison.
For each branch, compute the difference in the value delta between it and its ancestor and the difference between it and a random branch in the tree. Repeat random sampling as needed.
delta_info = subset(tree_info, !is.na(delta))
# Get info only from branches with delta values
#delta_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_anc_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c())
delta_random_diffs = data.frame("ref.node"=c(), "ref.delta"=c(), "sig.ref.delta"=c(), "query.node"=c(), "query.delta"=c(), "query.type"=c(), "delta.diff"=c(), "replicate"=c())
# Data frames to save delta diffs
for(i in 1:nrow(delta_info)){
# Loop over every branch with delta
node = delta_info[i,]$node
delta = delta_info[i,]$delta
sig_delta = delta_info[i,]$delta.sig
sig_label = "N"
if(sig_delta){
sig_label = "Y"
}
# Get the delta value and other info from the branch
anc = Ancestors(rodent_tree, node, type="parent")
if(tree_info[tree_info$node==anc,]$tp %in% exclude_branches){
next
}
# Get the ancestral node, but move on if it is an excluded branch
anc_delta = delta_info[delta_info$node==anc,]$delta
if(delta_info[delta_info$node==anc,]$delta.sig){
sig_label = "Ancestor Y"
}
# Get the delta info from the ancestor
#delta_info[delta_info$node==node,]$delta.diff = abs(delta - anc_delta)
delta_anc_diffs = rbind(delta_anc_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=anc, "query.delta"=anc_delta, "query.type"="Ancestor", "delta.diff"=abs(delta-anc_delta)))
# Add the info of the comparison of this node to its ancestor to the anc delta diff data frame
for(j in 1:100){
# Select random branches
random_anc = anc
while(random_anc == anc || random_anc == node){
random_row = delta_info[sample(nrow(delta_info),1),]
random_anc = random_row$node
}
# Select a random branch in the tree as long as it is not this branch or its ancestor
delta_random_diffs = rbind(delta_random_diffs, data.frame("ref.node"=node, "ref.delta"=delta, "sig.ref.delta"=sig_label, "query.node"=random_anc, "query.delta"=random_row$delta, "query.type"="Random branch", "delta.diff"=abs(delta-random_row$delta), "replicate"=j))
# Add the info of the comparison of this node to the random branch to the random delta diff data frame
}
#delta_info[delta_info$node==node,]$random.diff = abs(delta - random_row$delta)
}
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_single = subset(delta_random_diffs, replicate==1)
delta_random_single = select(delta_random_single, -replicate)
delta_random_single$query.type = "Random branch"
delta_diffs_single = rbind(delta_anc_diffs, delta_random_single)
# Get the first replicate, add label, and combine with anc diffs
delta_diff_single_p = ggplot(data=delta_diffs_single, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
ggtitle("Single random sample") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_single_p)# Distribution of delta diffs to ancestor and one random branch
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_meds = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=median(delta.diff))
delta_diffs_meds = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_meds)
# Calculate median of all random branch diffs and combine with anc diffs
delta_diff_med_p = ggplot(data=delta_diffs_meds, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Median after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_med_p)# Distribution of delta diffs to ancestor and median over many random branches
###############
x_comps = list(c("Ancestor", "Random branch"))
# List of comparisons to make
delta_random_avgs = delta_random_diffs %>% group_by(ref.node, query.type, sig.ref.delta) %>% summarize(delta.diff=mean(delta.diff))
delta_random_avgs$query.type = "Random branch"
delta_diffs_avgs = rbind(select(delta_anc_diffs, ref.node, query.type, sig.ref.delta, delta.diff), delta_random_avgs)
# Calculate average of all random branch diffs and combine with anc diffs
delta_diff_avg_p = ggplot(data=delta_diffs_avgs, aes(x=query.type, y=delta.diff, group=query.type, color=sig.ref.delta)) +
geom_quasirandom(size=2, width=0.25, alpha=0.30) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_y_continuous(limits=c(0,0.7)) +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=4), corecol(numcol=1, pal="wilke", offset=3), "#920000")) +
#ggtitle("Difference between delta and ancestral delta / average after 100 replicates of random sampling") +
ggtitle("Mean after 100 random samples") +
xlab("") +
ylab("Delta difference") +
bartheme() +
theme(axis.text.x=element_text(angle=15, hjust=1),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm")) +
guides(colour=guide_legend(override.aes=list(alpha=1)))
print(delta_diff_avg_p)# Distribution of delta diffs to ancestor and average over many random branches
###############
p_legend = get_legend(delta_diff_single_p)
p_grid = plot_grid(delta_diff_single_p + theme(legend.position="none"),
delta_diff_med_p + theme(legend.position="none"),
delta_diff_avg_p + theme(legend.position="none"),
ncol=3)
p = plot_grid(p_grid, p_legend, nrow=2, rel_heights=c(1,0.1), align='vh')
fig_outfile = here("docs", "figs", "full-coding-delta-anc-diffs.png")
ggsave(fig_outfile, p, width=12, height=4, unit="in")
# Save the figure4 Delta spatial patterns
Here, we look for evidence of introgression along the genome by calculating the average size of blocks of genes that share the same topology. For all genes that form a block of topologies on a given branch (e.g. all genes in a row that share the same topology), we subtract the starting position of the final gene from the starting position of the first gene in the block.
We then look at the sizes of blocks around each branch for each topology: 1. the concordant topology, 2. the major discordant topology (the one that is most frequent), and 3. the minor discordant topology.
Specifically, we take the difference in the average block size of the major and minor topologies to measure if the skew in gene tree topologies is observable in different areas of the genome.
topo_file = here("docs", "data", "trees", "astral", "concord-rooted-bl", "topo-counts.tab")
topo_counts = read.table(topo_file, header=T)
topo_counts = subset(topo_counts, !tp.node %in% exclude_branches)
topo_counts = subset(topo_counts, mchr != "X" & mchr != "Y")
# Read the topo counts for each gene for each branch
###############
block_outfile = here("docs", "data", "trees", "astral", "concord-rooted-bl", "blocks.tab")
# A file to save the block sizes to
###############
tree_info$depth = node.depth(rodent_tree)
#tree_info$depth.bl = get_all_node_depths(rodent_tree)
tree_info$depth.edge = NA
tree_info$depth.bl = NA
tree_info$depth.edge[189:nrow(tree_info)] = get_all_node_depths(rodent_tree, as_edge_count=T)
tree_info$depth.bl[189:nrow(tree_info)] = get_all_node_depths(rodent_tree)
# Depths of branches in the tree relative to tips
###############
topo_tree_info = subset(tree_info, node.type!="tip" & !tp %in% exclude_branches)
topo_cf_stats = cf_stats
names(topo_cf_stats)[1] = "node"
topo_cf_stats = merge(topo_cf_stats, select(topo_tree_info, node, node.type, branch.length, depth, depth.bl, depth.edge, tp, delta, delta.sig), by="node")
# Add the tree info to the cf table
topo_cf_stats$gD_prev = ifelse(topo_cf_stats$gDF1_N > topo_cf_stats$gDF2_N, "d1", "d2")
# Identify the prevalant discorant topology for each branchif(!file.exists(block_outfile)){
block_data = data.frame("tp"=c(), "num.genes.c"=c(), "num_genes_d1"=c(), "num_genes_d2"=c(), "num.genes.m"=c(),
"c.avg.block.size"=c(), "c.avg.block.count"=c(), "c.num.blocks"=c(),
"d1.avg.block.size"=c(), "d1.avg.block.count"=c(), "d1.num.blocks"=c(),
"d2.avg.block.size"=c(), "d2.avg.block.count"=c(), "d2.num.blocks"=c(),
"m.avg.block.size"=c(), "m.avg.block.count"=c(), "m.num.blocks"=c())
# Block sizes for each topology around each branch
for(node in levels(as.factor(topo_counts$tp.node))){
# Loop over every node in the tree
c_blocks = 0
c_block_sizes = c()
c_block_counts = c()
d1_blocks = 0
d1_block_sizes = c()
d1_block_counts = c()
d2_blocks = 0
d2_block_sizes = c()
d2_block_counts = c()
m_blocks = 0
m_block_sizes = c()
m_block_counts = c()
# Running tallies and lists of block sizes for each topology to be averaged
# over all chromosomes
num_genes_c = 0
num_genes_d1 = 0
num_genes_d2 = 0
num_genes_m = 0
for(chr in levels(as.factor(topo_counts$mchr))){
# Loop over every mouse chromosome
#print(paste(node, chr))
chrdata = subset(topo_counts, tp.node==node & mchr==chr)
# Subset topo counts from this chromosome and node
chrdata = chrdata[order(chrdata$mstart), ]
# Order the genes by their starting position on the chrome
chrdata$row.num = seq_along(chrdata[,1])
# Add the row number as a column
first = T
cur_topo = NA
num_genes = 0
# Initialize some things for this chromosomes
for(i in 1:nrow(chrdata)){
# Loop over every gene on this chromosomes
cur_gene = chrdata[i,]
# Get the current gene
if(first){
cur_topo = cur_gene$topo
num_genes = 0
block_start = cur_gene$mstart
first = F
next
}
# If this is the first gene of this chromosome/node combo, start the first block
if(cur_gene$topo == "m"){
num_genes_m = num_genes_m + 1
}else if(cur_gene$topo == "c"){
num_genes_c = num_genes_c + 1
}else if(cur_gene$topo == "d1"){
num_genes_d1 = num_genes_d1 + 1
}else if(cur_gene$topo == "d2"){
num_genes_d2 = num_genes_d2 + 1
}
# Count the topology of the current gene... I probably don't need to do this...
if(!cur_gene$topo %in% cur_topo || i == nrow(chrdata)){
block_size = cur_gene$mstart - block_start
if(cur_topo == "m"){
m_blocks = m_blocks + 1
m_block_sizes = c(m_block_sizes, block_size)
m_block_counts = c(m_block_counts, num_genes)
}else if(cur_topo == "c"){
c_blocks = c_blocks + 1
c_block_sizes = c(c_block_sizes, block_size)
c_block_counts = c(c_block_counts, num_genes)
}else if(cur_topo == "d1"){
d1_blocks = d1_blocks + 1
d1_block_sizes = c(d1_block_sizes, block_size)
d1_block_counts = c(d1_block_counts, num_genes)
}else if(cur_topo == "d2"){
d2_blocks = d2_blocks + 1
d2_block_sizes = c(d2_block_sizes, block_size)
d2_block_counts = c(d2_block_counts, num_genes)
}
# Get the size of the current block and add it to the correct list
cur_topo = cur_gene$topo
num_genes = 0
block_start = cur_gene$mstart
# Reset the block counts and positions
}else{
num_genes = num_genes + 1
}
# If there is a decisive topology for this gene/node combo AND the topology doesn't match the
# previous topology .. OR if this is the last gene of the chromosome, then we need to count the
# size of the block and start the next block
# Otherwise, just increment the number of genes
}
## End gene loop
}
## End chromosome loop
block_data = rbind(block_data, data.frame("tp"=node,
"num.genes.c"=num_genes_c, "num_genes_d1"=num_genes_d1, "num_genes_d2"=num_genes_d2, "num.genes.m"=num_genes_m,
"c.avg.block.size"=mean(c_block_sizes, na.rm=T), "c.avg.block.count"=mean(c_block_counts, na.rm=T), "c.num.blocks"=c_blocks,
"d1.avg.block.size"=mean(d1_block_sizes, na.rm=T), "d1.avg.block.count"=mean(d1_block_counts, na.rm=T), "d1.num.blocks"=d1_blocks,
"d2.avg.block.size"=mean(d2_block_sizes, na.rm=T), "d2.avg.block.count"=mean(d2_block_counts, na.rm=T), "d2.num.blocks"=d2_blocks,
"m.avg.block.size"=mean(m_block_sizes, na.rm=T), "m.avg.block.count"=mean(m_block_counts, na.rm=T), "m.num.blocks"=m_blocks))
# Calculate average block sizes for this node over all chromosomes and add to the main df
}
## End node loop
write.table(file=block_outfile, block_data, sep="\t", row.names=F)
# Write the block sizes per node to a file
}else{
block_data = read.table(block_outfile, header=T)
}
# If the block sizes have already been calculated, just read them in
block_data_merge = merge(block_data, topo_cf_stats, by="tp")
# Merge the block counts into the main cf df
###############
block_data_merge$anc.delta.sig = "N"
block_data_merge$desc.delta.sig = "N"
for(i in 1:nrow(block_data_merge)){
# Loop over every branch with delta
node = block_data_merge[i,]$node
# Get the node of the current row
anc = Ancestors(rodent_tree, node, type="parent")
if(!anc %in% topo_tree_info$node){
next
}
# Get the ancestral node, but move on if it is an excluded branch
anc_delta = block_data_merge[block_data_merge$node==anc,]$delta
if(block_data_merge[block_data_merge$node==anc,]$delta.sig == "Y"){
block_data_merge[i,]$anc.delta.sig = "Y"
}
# Get the delta info from the ancestor
for(desc in getDescendants(rodent_tree, node)){
if(!desc %in% topo_tree_info$node){
next
}
desc_delta = block_data_merge[block_data_merge$node==desc,]$delta
if(block_data_merge[block_data_merge$node==desc,]$delta.sig == "Y"){
block_data_merge[i,]$desc.delta.sig = "Y"
}
# Get the delta info from the ancestor
}
}
## Determine whether ancestor or either descendant has a significant delta for each branch
###############
block_data_merge$major.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d1.avg.block.size, block_data_merge$d2.avg.block.size)
block_data_merge$minor.block.size = ifelse(block_data_merge$gD_prev=="d1", block_data_merge$d2.avg.block.size, block_data_merge$d1.avg.block.size)
block_data_merge$block.diff = block_data_merge$major.block.size - block_data_merge$minor.block.size
block_data_merge$block.ratio = block_data_merge$major.block.size / block_data_merge$minor.block.size
# Finally, determine the block sizes for the major and minor topologies, and calculate their differences4.1 Distribution of block sizes for each topology around each branch
m = select(block_data_merge, tp, m.avg.block.size, delta.sig)
names(m)[2] = "block.size"
m$label = "Missing"
c = select(block_data_merge, tp, c.avg.block.size, delta.sig)
names(c)[2] = "block.size"
c$label = "Concordant"
d1 = select(block_data_merge, tp, d1.avg.block.size, delta.sig)
names(d1)[2] = "block.size"
d1$label = "D1"
d2 = select(block_data_merge, tp, d2.avg.block.size, delta.sig)
names(d2)[2] = "block.size"
d2$label = "D2"
a = rbind(c, d1, d2, m)
#x_comps = list(c("Observed", "Inferred"))
p = ggplot(a, aes(x=label, y=block.size, group=label, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
ylab("Avg. block size") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-dists.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure4.2 Distribution of the difference between major and minor block sizes
Note that a negative value indicates that, on average, blocks consisting of the minor topology are larger than those of the major topology.
###############
p = ggplot(block_data_merge, aes(x=1, y=block.diff, color=delta.sig)) +
geom_quasirandom(size=2, width=0.25, alpha=0.5) +
geom_boxplot(outlier.shape=NA, alpha=0.15, width=0.5, color="#666666") +
#geom_signif(comparisons=x_comps, map_signif_level=TRUE, textsize=4, size=1, step_increase=0.12, margin_top=0.1) +
scale_x_continuous(limits=c(0,2)) +
ylab("Block size difference") +
xlab("") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.title = element_text(size=14, vjust=1),
legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-dist.png")
ggsave(fig_outfile, p, width=4, height=4, unit="in")
# Save the figure4.3 Block size difference and delta
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
p = ggplot(block_data_merge, aes(x=delta, y=block.diff, color=delta.sig)) +
geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Delta") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-delta.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure4.4 Block size difference and tree depth
Maximum number of branches from a tip
# p = ggplot(block_data_merge, aes(x=block.diff, y=num.genes.na, color=delta.sig)) +
# geom_point(size=3, alpha=0.5) +
# geom_smooth(method="lm", se=F) +
# geom_text_repel(aes(label=ifelse(delta.sig=="Y",as.character(tp),'')), show_guide=F, min.segment.length=0) +
# bartheme()
# print(p)
block_data_merge = block_data_merge[order(block_data_merge$node), ]
# Re-sort the data frame by R node order after the merge so the trees still work
p = ggplot(block_data_merge, aes(x=depth.bl, y=block.diff, color=delta.sig)) +
geom_point(size=2, alpha=0.5) +
geom_smooth(method="lm", se=F, linetype="dashed", color="#333333") +
xlab("Tree depth") +
ylab("Block size difference") +
scale_color_manual("Significant delta?", values=c(corecol(numcol=1, pal="wilke", offset=3), corecol(numcol=1, pal="wilke", offset=4))) +
bartheme() +
theme(legend.title=element_text(size=12),
legend.position="bottom",
plot.margin = unit(c(1,0,0,0), "cm"))
print(p)###############
fig_outfile = here("docs", "figs", "full-coding-delta-block-diff-depth.png")
ggsave(fig_outfile, p, width=6, height=4, unit="in")
# Save the figure5 Time tree
Tree was dated using IQ-tree’s implementation of least squares dating with the following fossil calibrations:
calibration_file = here("docs", "data", "trees", "astral", "iqtree-dating", "fossil-calibrations-iqtree.txt")
cal_dates = read.table(calibration_file, header=F)
names(cal_dates) = c("MRCA of", "dates")
# Read the fossil calibration dates
cal_dates = cal_dates %>% separate(dates, c("Min.age", "Max.age"), sep=":", remove=T)
cal_dates$Max.age = abs(as.numeric(cal_dates$Max.age))
cal_dates$Min.age = abs(as.numeric(cal_dates$Min.age))
# Parse the dates better
cal_dates %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| MRCA of | Min.age | Max.age |
|---|---|---|
| mm10,Arvicanthus_niloticus_MNHN1999201 | 11.1 | 12.3 |
| Pithecheir_melanurus_LSUMZ38198,Arvicanthus_niloticus_MNHN1999201 | 8.7 | 10.1 |
| mm10,Mus_pahari_10460X1 | 7.3 | 8.3 |
| Leggadina_forresti_WAMM62323,Pseudomys_shortridgei_Z25113 | 4.0 | 4.5 |
# Display calibration points as a table
###############
#dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nwk")
#dated_tree = read.tree(dated_tree_file)
#tree_to_df_list = treeToDF(rodent_tree)
#dated_info = tree_to_df_list[["info"]]
# Read the tree and parse with treetoDF
#plotTreeTime(dated_tree, runif(Ntip(dated_tree)))
dated_tree_file = here("docs", "data", "trees", "astral", "iqtree-dating", "full-coding-astral-iqtree-dating-anil-mmus-12.1.timetree.nex.mod")
dated_tree_text = readLines(con=dated_tree_file, n = -1)
dated_tree = readMCMCtree(dated_tree_text, from.file = FALSE)
# Read tree as nexus file# From: https://cran.r-project.org/web/packages/MCMCtreeR/vignettes/MCMCtree_plot.html
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)###############
fig_outfile = here("docs", "figs", "full-coding-timetree.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
MCMC.tree.plot(dated_tree, cex.tips = 0.3, time.correction = 1, scale.res = c("Eon",
"Period", "Epoch", "Age"), plot.type = "phylogram", cex.age = 0.6, cex.labels = 0.6,
relative.height = 0.1, col.tree = "grey40", label.offset = 0.05,
node.method = "bar", no.margin = TRUE, label.timescale.names=TRUE, add.time.scale=TRUE, add.abs.time=TRUE,
lwd.bar=2)
dev.off()## png
## 2
6 Speciation rates with BAMM
taxo_file = here("docs", "data", "full-murinae-taxonomy.csv")
taxo_data = read.csv(taxo_file, header=T, quote="\"")
# Read the taxonomy data
###############
div_comp_sums = taxo_data %>% count(Division)
names(div_comp_sums)[2] = "Full"
div_samp_sums = subset(taxo_data, Exome==1) %>% count(Division)
names(div_samp_sums)[2] = "Sampled"
div_props = merge(div_comp_sums, div_samp_sums, by="Division")
div_props$div.prop.sampled = div_props$Sampled / div_props$Full
taxo_data = merge(taxo_data, select(div_props, Division, div.prop.sampled), by="Division")
# Get Division counts and sampling rates
div_sampling_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
write.table(div_props, file=div_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############
tribe_comp_sums = taxo_data %>% count(Tribe)
names(tribe_comp_sums)[2] = "Full"
tribe_samp_sums = subset(taxo_data, Exome==1) %>% count(Tribe)
names(tribe_samp_sums)[2] = "Sampled"
tribe_props = merge(tribe_comp_sums, tribe_samp_sums, by="Tribe")
tribe_props$tribe.prop.sampled = tribe_props$Sampled / tribe_props$Full
taxo_data = merge(taxo_data, select(tribe_props, Tribe, tribe.prop.sampled), by="Tribe")
# Get Tribe counts and sampling rates
tribe_sampling_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
write.table(tribe_props, file=tribe_sampling_file, row.names=F, quote=F)
# Write the counts to a file
###############sample_taxo_data = subset(taxo_data, Exome==1)
# Get the sampled species
###############
missing_samples = c("Lophuromys_woosnami_LSUMZ37793", "Lophiomys_imhausi_UM5152", "Mus_musculus-reference_10460X5", "Mus_musculus-domesticus_10460X15", "Mus_musculus-castaneus_10460X7", "Hydromys_sp-nov_YS391", "Genus_sp-nov_NMVZ21816", "Paruromys_dominator_JAE4870", "Mus_musculus-musculus_10460X14")
# A list of samples missing from the taxonomy file
tribe_prob_full = c("Pithecheir_melanurus_LSUMZ38198", "Vandeleuria_oleracea_ABTC116404")
tribe_prob = c("Pithecheir melanurus", "Vandeleuria oleracea")
div_prob_full = c("Coccymys_ruemmleri_ABTC49489", "Vandeleuria_oleracea_ABTC116404", "Anonymomys_mindorensis_FMNH2222010")
div_prob = c("Coccymys ruemmleri", "Vandeleuria oleracea", "Anonymomys mindorensis")
# Lists of samples with unclear taxonomy
###############
tribe_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
time_tree = read.nexus(dated_tree_file)
tribe_tree = drop.tip(time_tree, c(missing_samples, tribe_prob_full))
write.tree(tribe_tree, file=tribe_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
tree_to_df_list = treeToDF(tribe_tree)
tribe_tree_info = tree_to_df_list[["info"]]
tribe_tree_tips = subset(tribe_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF
tribe_tree_tips = tribe_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
tribe_tree_tips$Genus.species = paste(tribe_tree_tips$genus, tribe_tree_tips$species)
tribe_tree_tips[tribe_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
tribe_tree_tips[tribe_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips
###############
tribe_sample_file = here("docs", "data", "bamm", "tribe-sampling-rates.tab")
tribe_data = subset(sample_taxo_data, !Genus.species %in% tribe_prob)
tribe_data = merge(tribe_data, tribe_tree_tips, by="Genus.species")
tribe_data = select(tribe_data, label, Tribe, tribe.prop.sampled)
write.table(tribe_data, file=tribe_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species
prior_file = here("docs", "data", "bamm", "tribe-bamm-priors.txt")
setBAMMpriors(tribe_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
## TRIBE
##########################################################################################
## DIVISION
div_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
time_tree = read.nexus(dated_tree_file)
div_tree = drop.tip(time_tree, c(missing_samples, div_prob_full))
write.tree(div_tree, file=div_tree_file)
# Write the tree for Tribe sampling
# NOTE: manually convert 0 branch lengths to 0.000001 (there are 5 of these for some reason)
tree_to_df_list = treeToDF(div_tree)
div_tree_info = tree_to_df_list[["info"]]
div_tree_tips = subset(div_tree_info, node.type=="tip")
# Read the tree and parse with treetoDF
div_tree_tips = div_tree_tips %>% separate(label, c("genus", "species", "sample"), sep="_", remove=F)
div_tree_tips$Genus.species = paste(div_tree_tips$genus, div_tree_tips$species)
div_tree_tips[div_tree_tips$label=="mm10",]$Genus.species = "Mus musculus"
div_tree_tips[div_tree_tips$label=="rnor6",]$Genus.species = "Rattus norvegicus"
# Parse the genus and species names of the remaining tips
###############
div_sample_file = here("docs", "data", "bamm", "div-sampling-rates.tab")
div_data = subset(sample_taxo_data, !Genus.species %in% div_prob)
div_data = merge(div_data, div_tree_tips, by="Genus.species")
div_data = select(div_data, label, Division, div.prop.sampled)
write.table(div_data, file=div_sample_file, sep="\t", row.names=F, col.names=F, quote=F)
# Write the Tribe sampling rates to a file
# NOTE: manually add 1.0 to the first line
# Format from: http://bamm-project.org/advanced.html#accounting-for-non-random-incomplete-taxon-sampling-in-diversification-studies
###############
# From: http://bamm-project.org/quickstart.html#speciation-extinction-analyses
taxo_data_nonextinct = subset(taxo_data, !grepl("EXTINCT",Extinct.TAXONOMY))
# Remove extinct species from complete list of species
prior_file = here("docs", "data", "bamm", "div-bamm-priors.txt")
setBAMMpriors(div_tree, total.taxa=nrow(taxo_data_nonextinct), outfile=prior_file)
# Generate BAMM priors based on the tree
###############6.1 Sampling rates based on Tribe
tribe_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Tribe | Full | Sampled | tribe.prop.sampled |
|---|---|---|---|
| Apodemini | 26 | 1 | 0.0384615 |
| Arvicanthini | 90 | 22 | 0.2444444 |
| Hydromyini | 211 | 70 | 0.3317536 |
| incertae sedis | 11 | 2 | 0.1818182 |
| Malacomyini | 3 | 1 | 0.3333333 |
| Murini | 43 | 11 | 0.2558140 |
| Otomyini | 31 | 2 | 0.0645161 |
| Phloeomyini | 18 | 6 | 0.3333333 |
| Praomyini | 55 | 9 | 0.1636364 |
| Rattini | 193 | 55 | 0.2849741 |
6.2 Speciation rates based on Tribe sampling
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-tribe.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run
event_data_file = here("docs", "data", "bamm", "tribe_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run
###############
ed = getEventData(bamm_tree, events, burnin=0.25)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)# Plot the rates and shifts on the tree
###############
fig_outfile = here("docs", "figs", "full-coding-bamm-tribe.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)###############
fig_outfile = here("docs", "figs", "full-coding-bamm-tribe-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()## png
## 2
6.3 Sampling rates based on Division
div_props %>% kable() %>% kable_styling(bootstrap_options=c("striped", "condended", "responsive"), full_width=F)| Division | Full | Sampled | div.prop.sampled |
|---|---|---|---|
| Aethomys | 11 | 3 | 0.2727273 |
| Apodemus | 26 | 1 | 0.0384615 |
| Arvicanthis | 30 | 7 | 0.2333333 |
| Berylmys | 4 | 2 | 0.5000000 |
| Bunomys | 31 | 15 | 0.4838710 |
| Chiropodomys | 6 | 1 | 0.1666667 |
| Chrotomys | 39 | 5 | 0.1282051 |
| Colomys | 4 | 2 | 0.5000000 |
| Conilurus | 7 | 6 | 0.8571429 |
| Dacnomys | 41 | 7 | 0.1707317 |
| Dasymys | 14 | 2 | 0.1428571 |
| Echiothrix | 10 | 9 | 0.9000000 |
| Haeromys | 3 | 1 | 0.3333333 |
| Hybomys | 11 | 5 | 0.4545455 |
| Hydromys | 32 | 9 | 0.2812500 |
| incertae sedis | 13 | 3 | 0.2307692 |
| Malacomys | 3 | 1 | 0.3333333 |
| Mallomys | 11 | 5 | 0.4545455 |
| Maxomys | 22 | 4 | 0.1818182 |
| Mus | 43 | 11 | 0.2558140 |
| Oenomys | 24 | 5 | 0.2083333 |
| Otomys | 31 | 2 | 0.0645161 |
| Phloeomys | 18 | 6 | 0.3333333 |
| Pithecheir | 3 | 1 | 0.3333333 |
| Pogonomys | 16 | 6 | 0.3750000 |
| Pseudomys | 41 | 32 | 0.7804878 |
| Rattus | 84 | 17 | 0.2023810 |
| Stenocephalemys | 51 | 7 | 0.1372549 |
| Uromys | 52 | 4 | 0.0769231 |
6.4 Speciation rates based on Division sampling
# From: https://lukejharmon.github.io/ilhabela/instruction/2015/07/02/diversification-analysis-bamm-rpanda/
bamm_tree_file = here("docs", "data", "bamm", "full-coding-astral-timetree-div.tre")
bamm_tree = read.tree(bamm_tree_file)
# Read the tree from the BAMM run
event_data_file = here("docs", "data", "bamm", "div_event_data.txt")
events = read.csv(event_data_file)
# Read the event file from the BAMM run
###############
ed = getEventData(bamm_tree, events, burnin=0.25)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 249000
## Analyzing 751 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
best = getBestShiftConfiguration(ed, expectedNumberOfShifts=1, threshold=5)## Processing event data from data.frame
##
## Discarded as burnin: GENERATIONS < 0
## Analyzing 1 samples from posterior
##
## Setting recursive sequence on tree...
##
## Done with recursive sequence
# Get the best shift possibility (MAP)
# See: http://bamm-project.org/rateshifts.html#overall-best-shift-configuration
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)# Plot the rates and shifts on the tree
###############
fig_outfile = here("docs", "figs", "full-coding-bamm-div.png")
png(fig_outfile, width=12, height=12, units="in", res=320)
bamm_tree_p = plot.bammdata(best, lwd=2, labels = T, cex = 0.3)
addBAMMshifts(best, cex=2)
addBAMMlegend(bamm_tree_p, nTicks = 4, side = 4, las = 1)
dev.off()## png
## 2
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex.lab=1.5)###############
fig_outfile = here("docs", "figs", "full-coding-bamm-div-time.png")
png(fig_outfile, width=6, height=4, units="in", res=320)
plotRateThroughTime(ed, intervalCol=corecol(numcol=1), avgCol=corecol(numcol=1), ylim=c(0,1), cex.axis=1, cex=1.5)
dev.off()## png
## 2